* This do-file follows clean_081519.
* It adds measures for familiar roads.
* The dataset generated by this file is used for the robustness check on familiar roads.

* ---------------------------------------------------------------
* Session setup: reset Stata, set the working directory, define
* the project folder globals, and open the log for this run.
* ---------------------------------------------------------------
clear 
clear matrix
* Only matters in old Stata releases; newer ones manage memory automatically.
set memory 1000m
set more off
* Close any log left open by a previous run (no error if none is open).
cap log close

cd "/Users/..."

* Project folders. The quoted paths must contain plain ASCII only: the
* previous definitions of do_file and raw_data carried invisible Unicode
* direction-control characters (picked up when the path was pasted), so
* those two globals did not hold the intended folder names.
global do_file="/Users/.../do_file"
global log_file="/Users/.../log_file"
global raw_data="/Users/.../raw_data"
global working_data="/Users/.../working_data"
global results="/Users/.../results"

* The log path is relative to the directory set by -cd- above.
log using log_file/clean_012020.log, replace



************************************
* Generate measure for familiar trip
************************************



* Load the trip-level file (path is relative to the current working directory).
use working_data/clean_trip_level_geo_weather_081519.dta, clear

sort user_id trip_id


* construct familiar road measure
*
* Each coordinate is converted to text and cut to its leading characters
* (6 for longitude, 5 for latitude). The start and end points of a trip are
* then glued together into one "route" string, loc_cor.
* NOTE(review): the cut is a character truncation, not a numeric rounding, so
* the number of decimals kept depends on the sign and number of integer digits
* of the coordinate -- confirm this is the intended spatial resolution.
foreach pt in end start {
	* copy -> string -> keep the first 6 characters of the longitude
	gen longitude_`pt'_new=longitude_`pt'
	tostring longitude_`pt'_new, force replace
	gen long_`pt'=substr(longitude_`pt'_new, 1, 6)
	drop longitude_`pt'_new

	* copy -> string -> keep the first 5 characters of the latitude
	gen latitude_`pt'_new=latitude_`pt'
	tostring latitude_`pt'_new, force replace
	gen lat_`pt'=substr(latitude_`pt'_new, 1, 5)
	drop latitude_`pt'_new
}

gen end_loc="("+long_end+","+lat_end+")"
gen start_loc="("+long_start+","+lat_start+")"

gen loc_cor=start_loc+end_loc

* Each point is "(" + 6 chars + "," + 5 chars + ")" = 14 characters, so a
* route string of length 28 means all four coordinate pieces are full length.
* Shorter strings (e.g. a missing coordinate, which becomes ".") are excluded.
gen loc_length=length(loc_cor)

* Number a user's trips on the same route in trip_id order. The within-group
* order is stated in the by-prefix itself instead of relying on the sort state
* left by an earlier -sort-.
bysort user_id loc_cor (trip_id): gen temp=_n if loc_length==28
tab temp

* A trip is "familiar" when the same user already has a trip (lower trip_id)
* on the same route; the first trip on a route, and trips without a full
* route string, are coded 0.
gen familiar_trip=1 if temp>1 & temp~=.
replace familiar_trip=0 if familiar_trip==.

sort user_id trip_id

* Drop the helper variables by name (not by position range) so the command
* does not depend on the order in which variables were created.
drop long_end lat_end long_start lat_start end_loc start_loc loc_cor loc_length temp






******************************
* NM in a familiar trip or not 
******************************



* Split each trip-level event count into a "familiar" and a "nonfamiliar"
* version: the count is kept when the trip belongs to that group and is 0
* otherwise (a missing source count also becomes 0).
*
* The full variable name familiar_trip is used in every condition; part of
* this section previously relied on the abbreviation -familiar-, which only
* resolves while variable abbreviation is on and no other variable starts
* with "familiar".
*
* src_vars and new_stubs are parallel lists: the i-th source variable is
* written to <i-th stub>_familiar and <i-th stub>_nonfamiliar.
local src_vars  total_real_hard_brake_1 total_prev_hard_brake total_hard_brake total_hard_turn
local new_stubs total_real_hb1 total_prev_hb total_hard_hb total_hard_turn

forvalues i=1/4 {
	local src  : word `i' of `src_vars'
	local stub : word `i' of `new_stubs'

	gen `stub'_familiar=`src' if familiar_trip==1
	replace `stub'_familiar=0 if `stub'_familiar==.

	gen `stub'_nonfamiliar=`src' if familiar_trip==0
	replace `stub'_nonfamiliar=0 if `stub'_nonfamiliar==.
}


**************************************
* Second Step: generate day-level data
**************************************

* Attach day-level aggregates (one value per user_id x trip_start_date) to
* every trip row. Variables are created in a fixed order so the column order
* of the resulting dataset is stable.
order user_id trip_start_date

sort user_id trip_start_date start_time_new

* grouping key shared by all aggregates below
local dayid user_id trip_start_date


* hard brakes: day_<name> is the daily sum of <name>
foreach v in total_real_hard_brake_1 total_real_hard_brake_2 total_prev_hard_brake total_hard_brake {
	bysort `dayid': egen day_`v'=total(`v')
}

* familiar / nonfamiliar splits of the hard-brake counts
foreach grp in familiar nonfamiliar {
	foreach stub in total_real_hb1 total_prev_hb total_hard_hb {
		bysort `dayid': egen day_`stub'_`grp'=total(`stub'_`grp')
	}
}

forvalues k=1/3 {
	bysort `dayid': egen day_total_hard_brake_`k'=total(total_hard_brake_`k')
}

* "hard_brake_close_turn" is shortened to "hb_close_turn" here: with the day_
* prefix the full name would be 33 characters, over Stata's 32-character limit.
forvalues k=1/3 {
	bysort `dayid': egen day_total_hb_close_turn_`k'=total(total_hard_brake_close_turn_`k')
}


* hard turns, followed by their familiar / nonfamiliar splits
foreach v in total_hard_left total_hard_right total_hard_u total_hard_turn ///
	total_hard_turn_familiar total_hard_turn_nonfamiliar {
	bysort `dayid': egen day_`v'=total(`v')
}


* risky behaviors
foreach v in total_agg_acc total_agg_acc_alone total_real_agg_acc_1 total_real_agg_acc_2 total_agg_acc_only ///
	total_phone_use total_phone_use_sec total_traffic_jam total_traffic_sec total_acc {
	bysort `dayid': egen day_`v'=total(`v')
}

* distance, duration, and speed
foreach v in distance duration {
	bysort `dayid': egen day_total_`v'=total(`v')
}
* day_avg_speed: total distance over total duration for the day
gen day_avg_speed=day_total_distance/day_total_duration

* day_avg_speed2: unweighted mean of the trip-level speed variable
bysort `dayid': egen day_avg_speed2=mean(speed)

* drive at night
gen night_duration=duration if drive_at_night==1
bysort `dayid': egen day_total_night_duration=total(night_duration)
gen night_duration_prop=day_total_night_duration/day_total_duration
* 1 when any night duration was recorded that day, 0 when none
gen day_drive_at_night=cond(day_total_night_duration>0, 1, cond(day_total_night_duration==0, 0, .))

* driving scores: daily mean of each trip-level score
foreach s in control cautious focused drive {
	bysort `dayid': egen day_avg_`s'_score=mean(`s'_score)
}


* highway and rush hour
bysort `dayid': egen day_total_highway=total(highway)
bysort `dayid': egen day_rush_hour=total(rush_hour)


* whether this day has a familiar trip or not
bysort `dayid': egen day_total_familiar_trip=total(familiar_trip)
gen day_familiar_trip_dummy=cond(day_total_familiar_trip>0, 1, 0)

* number of trips recorded for the user on that day
bysort `dayid': gen day_total_trip=_N




** Keep one row per user-day: the trip with the earliest start_time_new.
** Stata places observations with tied sort keys in arbitrary order, so trip_id
** is added as a tie-breaker; without it, two trips sharing the same
** start_time_new could yield a different surviving row from run to run.
** The within-day order is given in the by-prefix so the result does not
** depend on the sort state left by earlier commands.
bysort user_id trip_start_date (start_time_new trip_id): keep if _n==1 

* Inspect the distribution of daily distance and list very large values.
sum day_total_distance, d
tab day_total_distance if day_total_distance>1000, m

* Inspect daily duration; values above 24 are listed for review
* (presumably duration is measured in hours -- confirm units).
sum day_total_duration, d
tab day_total_duration if day_total_duration>24, m

* Inspect daily average speed and blank out values above 200.
* Missing values also satisfy ">200" in Stata, so they show up in the
* tabulation (option m) and are left as missing by the replace.
sum day_avg_speed, d
tab day_avg_speed if day_avg_speed>200, m
replace day_avg_speed=. if day_avg_speed>200


* Distribution of the daily mean driving scores.
sum day_avg_control_score, d
sum day_avg_cautious_score, d
sum day_avg_focused_score, d
sum day_avg_drive_score, d

* Remove the trip-level variables now that each user-day is a single row.
* They are grouped into named lists so each group can be read at a glance.

* trip identifiers, timestamps and coordinates
local trip_keys trip_id start_time end_time longitude* latitude*  start_time_new end_time_new

* trip-level driving scores
local trip_scores control_score cautious_score focused_score drive_score

* trip-level distance, hard-brake and hard-turn counts
local trip_brake_turn distance total_real_hard_brake_1 total_real_hard_brake_2 total_prev_hard_brake ///
	total_hard_brake total_hard_brake_1 total_hard_brake_2 total_hard_brake_3 ///
	total_hard_brake_close_turn_1 total_hard_brake_close_turn_2 total_hard_brake_close_turn_3 ///
	total_hard_left total_hard_right total_hard_u total_hard_turn

* trip-level aggressive-acceleration counts
local trip_agg_acc total_agg_acc total_agg_acc_alone total_real_agg_acc_1 total_real_agg_acc_2 total_agg_acc_only

* remaining trip-level measures and flags
local trip_other total_phone_use total_traffic_jam total_phone_use_sec total_traffic_sec ///
	total_acc drive_at_night duration night_duration speed hour rush_hour highway date gap_time familiar_trip

* trip-level familiar / nonfamiliar splits
local trip_familiar total_real_hb1_familiar total_real_hb1_nonfamiliar total_prev_hb_familiar total_prev_hb_nonfamiliar ///
	total_hard_hb_familiar total_hard_hb_nonfamiliar total_hard_turn_familiar total_hard_turn_nonfamiliar

drop `trip_keys'
drop `trip_scores'
drop `trip_brake_turn'
drop `trip_agg_acc'
drop `trip_other'
drop `trip_familiar'


********* Rename Day Level Variables
* Each day-level aggregate takes over the plain (trip-level) variable name.

* Aggregates whose final name is the current name without the day_ prefix
foreach v in total_real_hard_brake_1 total_real_hard_brake_2 total_prev_hard_brake total_hard_brake ///
	total_real_hb1_familiar total_real_hb1_nonfamiliar ///
	total_prev_hb_familiar total_prev_hb_nonfamiliar ///
	total_hard_hb_familiar total_hard_hb_nonfamiliar ///
	total_hard_brake_1 total_hard_brake_2 total_hard_brake_3 ///
	total_hard_left total_hard_right total_hard_u total_hard_turn ///
	total_hard_turn_familiar total_hard_turn_nonfamiliar ///
	total_agg_acc total_agg_acc_alone total_real_agg_acc_1 total_real_agg_acc_2 total_agg_acc_only ///
	total_phone_use total_phone_use_sec total_traffic_jam total_traffic_sec total_acc ///
	total_night_duration drive_at_night rush_hour ///
	total_familiar_trip familiar_trip_dummy total_trip {
	rename day_`v' `v'
}

* The shortened hb_close_turn names go back to the full hard_brake_close_turn names
forvalues k=1/3 {
	rename day_total_hb_close_turn_`k' total_hard_brake_close_turn_`k'
}

* Daily totals that drop both the day_ and the total_ part
foreach v in distance duration highway {
	rename day_total_`v' `v'
}

* Daily averages that drop the day_avg_ part
rename day_avg_speed speed
rename day_avg_speed2 speed2
foreach s in control focused cautious drive {
	rename day_avg_`s'_score `s'_score
}

* gap_time: number of calendar days between a user's driving day and that
* user's previous driving day (missing for the user's first day).

* Parse the string date first so days can be ordered by their numeric value.
* Sorting on the string itself is chronological only when every date is
* zero-padded year-month-day text.
gen trip_start_date_new=date(trip_start_date, "YMD")

* Consecutive index of each user's driving days in chronological order; the
* string date is a tie-breaker for any dates that fail to parse (missing).
bysort user_id (trip_start_date_new trip_start_date): gen day_index=_n

* Declare the panel on the consecutive index so that the lag operator refers
* to the previous driving day rather than the previous calendar day.
tsset user_id day_index

gen gap_days=trip_start_date_new-l.trip_start_date_new

drop trip_start_date_new day_index
rename gap_days gap_time  



** Summarize the day-level variables, drop days with missing values, and save

* event counts, including the familiar / nonfamiliar splits
sum total_hard_brake total_hard_brake_1 total_hard_brake_2 total_hard_brake_3 ///
	total_hard_turn total_hard_left total_hard_right total_hard_u ///
	total_prev_hard_brake total_real_hard_brake_1 total_real_hard_brake_2 ///
	total_prev_hb_familiar total_real_hb1_familiar total_hard_hb_familiar total_hard_turn_familiar ///
	total_prev_hb_nonfamiliar total_real_hb1_nonfamiliar total_hard_hb_nonfamiliar total_hard_turn_nonfamiliar
	

* scores, calendar / traffic indicators and weather
sum	control_score cautious_score focused_score ///
	weekend rush_hour total_traffic_jam ///
	high_temper low_temper sunny cloudy rain_storm rain snow windy foggy  

	
* other driving behaviors and trip characteristics
sum total_agg_acc total_agg_acc_alone total_real_agg_acc_1 total_real_agg_acc_2 total_agg_acc_only ///
	total_phone_use distance duration speed drive_at_night highway 	
	

* Drop days with a missing temperature or a missing speed. The full name
* low_temper is used (it was previously abbreviated as low_temp, which only
* resolves while variable abbreviation is on and the abbreviation is unique).
drop if high_temper==. | low_temper==. | speed==.
	

* Path is relative to the current working directory.
save working_data/clean_day_level_geo_weather_familiar_012020.dta, replace
